Importing libraries and setting the working directory
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.4
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(naniar)
library(leaflet)
library(htmltools)
library(choroplethr)
## Loading required package: acs
## Loading required package: XML
##
## Attaching package: 'acs'
## The following object is masked from 'package:htmltools':
##
## span
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:base':
##
## apply
library(dplyr)
library(choroplethrMaps)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:acs':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
library(ggmap)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
library(htmlwidgets)
library(mapview)
library(dplyr)
library(plotly)
library(devtools)
## Loading required package: usethis
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(here)
## here() starts at C:/Users/Sanjana Gupta/Desktop/Winter 2020/Data 598C Data Science Process/Data Science Project
#setwd("~/Desktop/Winter 2020/Data 598C Data Science Process/Data Science Project")
#setwd(here("/Data Science Project"))
Importing data and creating a training dataset
# Read the three CSV files from the data/ folder (paths are relative to the
# current working directory). header = TRUE is read.csv()'s default and is
# therefore left implicit.
data1 <- read.csv("data/listings.csv")
data2 <- read.csv("data/calendar.csv")
data3 <- read.csv("data/reviews.csv")

# All cleaning below is done on data_train, a copy of the listings table,
# so data1 keeps the data exactly as it was read.
data_train <- data1
Viewing the data in different formats
#View(data_train)
# glimpse() gives a compact overview of the data - uses dplyr
#glimpse(data_train)
#str(data1)
#summary(data_train)
#head(data_train, 1000)
#tail(data_train, 50)
#cols_to_delete <- c("last_scraped", "thumbnail_url", "host_picture_url", "medium_url", "picture_url", "xl_picture_url", "host_thumbnail_url", "host_picture_url")
#select(data_train, -contains("url"))
# Remove a fixed set of columns from data_train in one step. Names in the
# list that are not present in the data frame are simply ignored, and the
# remaining columns keep their original order.
data_train <- data_train[!(names(data_train) %in% c(
  "last_scraped",
  "thumbnail_url",
  "host_picture_url",
  "medium_url",
  "picture_url",
  "xl_picture_url",
  "host_thumbnail_url",
  "host_url",
  "scrape_id",
  "experiences_offered",
  "neighborhood_overview",
  "host_about",
  "host_id",
  "host_verifications",
  "host_has_profile_pic",
  "host_identity_verified",
  "calendar_last_scraped",
  "license",
  "instant_bookable",
  "require_guest_profile_picture",
  "require_guest_phone_verification",
  "summary",
  "notes"
))]
# Down to 69 columns from the initial 92
# Reset the row names to the default 1..n sequence
rownames(data_train) <- NULL
#Viewing the data
#glimpse(data_train)
#summary(data_train$is_location_exact)
# Convert the price column from text to integer.
# Assumes values are formatted like "$1,000.00" - TODO confirm against the CSV.
# gsub() removes every "$" and "," in one pass; sub() only replaces the first
# match, so a value with more than one thousands separator would otherwise
# fail to parse and silently become NA.
# as.integer() drops any fractional part of the parsed number.
data_train$price <- as.integer(gsub("[$,]", "", data_train$price))
# Columns that are treated as categorical variables in data_train.
factor_cols <- c(
  "host_response_time",
  "host_is_superhost",
  "neighbourhood_cleansed",
  "is_location_exact",
  "property_type",
  "room_type",
  "bed_type",
  "calendar_updated",
  "cancellation_policy"
)
# Convert each of them to a factor in place.
data_train[factor_cols] <- lapply(data_train[factor_cols], as.factor)
# Convert the host response rate from a percentage string to a proportion:
# drop the "%" sign, parse the number and divide by 100.
# Entries that are still not numeric after removing "%" become NA, which is
# what produces the coercion warning shown below.
data_train$host_response_rate <-
  as.numeric(sub("%", "", data_train$host_response_rate)) / 100
## Warning: NAs introduced by coercion
# Convert the extra-person fee from text to numeric. Both "$" and "," are
# stripped (as is done for price) so that amounts containing a thousands
# separator are parsed instead of silently turning into NA.
data_train$extra_people <-
  as.numeric(gsub("[$,]", "", data_train$extra_people))
# Is there any listing whose price is exactly 0?
# Only the value 0 is looked for; NA prices are not detected by this check.
is.element(0, data_train$price)
## [1] FALSE
No listing has a price of 0, so all Airbnb listings have a price associated with them
# Show only the listings that have no missing value in any column.
# is.na() is used rather than is.null(): a column vector is never NULL, so an
# is.null() test is TRUE for every column and the filter would keep all rows.
# NOTE(review): the result is only printed, not assigned - confirm whether
# data_train itself should be restricted to these rows.
data_train %>% filter_all(all_vars(!is.na(.)))